import pandas as pd

#import sys
#args = sys.argv
#c = int(args[1])
c = 1000

# Earlier input location, kept for reference:
# '../output/final_data/narratives_complete_with_metadata_{0}.csv'
INPUT_TEMPLATE = '../data/gpo_final_data/narratives_complete_with_metadata_{0}_no_frequency_filter.csv'
OUTPUT_TEMPLATE = '../data/metadata/gpo_manual_cluster_labels/clusters_to_label_{0}.csv'

# (label column, raw-text column) pairs that get stacked into ARG / ARG-RAW.
# NOTE(review): the first pair is spelled with the letter "O" ('ARGO'), not the
# digit zero -- confirm this matches the CSV header.
ROLE_COLUMNS = (('ARGO', 'ARGO-RAW'), ('ARG1', 'ARG1-RAW'))


def build_cluster_table(df, top_n=10):
    """Summarise, per ARG label, its most frequent raw strings.

    The (label, raw) column pairs listed in ``ROLE_COLUMNS`` are stacked into
    a single ``ARG`` / ``ARG-RAW`` table, missing values are replaced by the
    empty string, and every distinct (ARG, ARG-RAW) pair is counted.

    Args:
        df: Frame containing the columns named in ``ROLE_COLUMNS``.
        top_n: Maximum number of raw strings kept per ARG (most frequent
            first).

    Returns:
        A frame with one row per distinct ARG and the columns ``ARG``,
        ``cluster_elements`` (``"<raw> - <count>"`` items joined by
        ``" | "``) and ``cluster_label_manual`` (empty, to be filled in by
        hand).
    """
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # rename() returns a new frame, so no slice of `df` is modified in place.
    stacked = pd.concat(
        [
            df[[label, raw]].rename(columns={label: 'ARG', raw: 'ARG-RAW'})
            for label, raw in ROLE_COLUMNS
        ]
    ).fillna('')

    counts = stacked.groupby(['ARG', 'ARG-RAW']).size().reset_index(name='count')

    # Two stable single-column sorts: rows end up ordered by ARG, then by
    # descending count, with ties left in ARG-RAW order. This avoids
    # groupby.apply, whose handling of the grouping column differs across
    # pandas versions.
    counts = (
        counts.sort_values('count', ascending=False, kind='stable')
        .sort_values('ARG', kind='stable')
        .reset_index(drop=True)
    )

    top = counts.groupby('ARG').head(top_n).copy()
    top['ARG-RAW'] = top['ARG-RAW'] + ' - ' + top['count'].astype(str)
    top['cluster_elements'] = top.groupby('ARG')['ARG-RAW'].transform(
        lambda items: ' | '.join(items)
    )

    # Sanity check: every row of an ARG must carry the same summary string.
    # Any ARG printed here indicates a problem in the steps above.
    distinct_summaries = top.groupby('ARG', sort=False)['cluster_elements'].nunique()
    for arg in distinct_summaries.index[distinct_summaries > 1]:
        print(arg)

    # The summary is deliberately not re-split on ',': elements are separated
    # by ' | ', so splitting on commas only cut raw arguments that contain a
    # comma and reordered the pieces through an unordered set.
    clusters = top.drop_duplicates(subset=['ARG']).copy()
    clusters['cluster_label_manual'] = ''

    return clusters[['ARG', 'cluster_elements', 'cluster_label_manual']]


def main(c=c):
    """Read the narratives CSV for ``c`` and write the clusters-to-label CSV.

    Args:
        c: Value substituted into the input and output file names.
    """
    df = pd.read_csv(INPUT_TEMPLATE.format(c))
    clusters = build_cluster_table(df)

    print('Clusters to inspect:', len(clusters))

    clusters.to_csv(OUTPUT_TEMPLATE.format(c))


if __name__ == '__main__':
    main()